library(dplyr)
library(data.table)
library(ggplot2)
library(gridExtra)
library(viridis)
library(xtable)
"%&%" = function(a,b) paste(a,b,sep="")
date <- Sys.Date()
my.dir <- "/home/wheelerlab3/mesa_analyses/"
#my.dir <- "~/mount/wheelerlab3/mesa_analyses/"
#read in BSLMM
for(pop in c('AFA','CAU','HIS')){
  filelist <- scan(my.dir %&% "BSLMM_exp/" %&% pop %&% "filelist", "c")
  for(file in filelist){
    a <- fread(my.dir %&% "BSLMM_exp/" %&% file) %>%
      dplyr::mutate(pop=pop)
    if(exists('bslmm')){
      bslmm <- rbind(bslmm, a)
    }else{
      bslmm <- a
    }
  }
}

#read in BVSR
for(pop in c('AFA','CAU','HIS')){
  filelist <- scan(my.dir %&% "BVSR_exp/" %&% pop %&% "filelist", "c")
  for(file in filelist){
    a <- fread(my.dir %&% "BVSR_exp/" %&% file) %>%
      dplyr::mutate(pop=pop)
    if(exists('bvsr')){
      bvsr <- rbind(bvsr, a)
    }else{
      bvsr <- a
    }
  }
}
#join by ensg
all <- left_join(bslmm,bvsr,by=c('gene','pop'))

summary(lm(hh50~pve50,all))
## 
## Call:
## lm(formula = hh50 ~ pve50, data = all)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.248485 -0.006475  0.001618  0.008398  0.243683 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0115639  0.0001198  -96.55   <2e-16 ***
## pve50        0.9918154  0.0005807 1707.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01563 on 29389 degrees of freedom
## Multiple R-squared:   0.99,  Adjusted R-squared:   0.99 
## F-statistic: 2.917e+06 on 1 and 29389 DF,  p-value: < 2.2e-16
summary(lm(hh50~pve50,all))$coefficients
##                Estimate   Std. Error    t value Pr(>|t|)
## (Intercept) -0.01156387 0.0001197765  -96.54541        0
## pve50        0.99181541 0.0005807203 1707.90564        0
cor.test(all$hh50,all$pve50)
## 
##  Pearson's product-moment correlation
## 
## data:  all$hh50 and all$pve50
## t = 1707.9, df = 29389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9948848 0.9951129
## sample estimates:
##       cor 
## 0.9950001
cor.test(all$hh50,all$pve50,method='s')
## Warning in cor.test.default(all$hh50, all$pve50, method = "s"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  all$hh50 and all$pve50
## S = 1.0677e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9747682
#make plot like this for elastic net alpha comparison? LASSO vs. EN (colored by alpha=0.5 or 0.05)
# ggplot(all,aes(x=pve50,y=hh50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) +
#   xlab('BSLMM PVE') + ylab('BVSR PVE')
# ggplot(all,aes(x=pve50,y=hh50,color=pge50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0)
# ggplot(all,aes(x=pve50,y=hh50,color=snp50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0) 
# ggplot(all,aes(x=pve50,y=hh50,color=n_gamma50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0)

cor.test(all$snp50,all$n_gamma50)
## 
##  Pearson's product-moment correlation
## 
## data:  all$snp50 and all$n_gamma50
## t = 13.643, df = 29389, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06796058 0.09068185
## sample estimates:
##        cor 
## 0.07933152
cor.test(all$snp50,all$n_gamma50,method='s')
## Warning in cor.test.default(all$snp50, all$n_gamma50, method = "s"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  all$snp50 and all$n_gamma50
## S = 3.1348e+12, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2591827
# ggplot(all,aes(x=n_gamma50, y=snp50)) + geom_density_2d() + facet_wrap(~pop)
# ggplot(all,aes(x=n_gamma50, y=snp50, color=hh50)) + geom_point() + facet_wrap(~pop)

data <- all %>% mutate(position=1:length(pve50),`medianSNPs<=10`=n_gamma50<=10,LCS=factor(pge025<=0.01,labels=c('> 0.01','<= 0.01')))
# ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point() + theme_bw(12) + xlab("PVE") + ylab("PGE") + theme(legend.position = c(1,0),legend.justification = c(1,0)) + facet_wrap(~pop)

ngenes <- dim(all)[1]/3
sorted <- dplyr::arrange(all,pop,pve50) %>% mutate(order=rep(c(1:ngenes),3))
# ggplot(sorted,aes(x=order,y=pve50,ymin=pve025,ymax=pve975)) + geom_pointrange(col='gray') + geom_point() + facet_wrap(~pop)

sorted <- dplyr::arrange(all,pop,hh50) %>% mutate(order=rep(c(1:ngenes),3))
# ggplot(sorted,aes(x=order,y=hh50,ymin=hh025,ymax=hh975)) + geom_pointrange(col='gray') + geom_point() + facet_wrap(~pop)

BSLMM Sup Fig

pve <- dplyr::select(all, pop, gene, pve50, hh50)

h2_afa <- read.table(my.dir %&% "GCTA_exp/AFA_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='AFA') %>% dplyr::select(ensid,h2,pop)
h2_cau <- read.table(my.dir %&% "GCTA_exp/CAU_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='CAU') %>% dplyr::select(ensid,h2,pop)
h2_his <- read.table(my.dir %&% "GCTA_exp/HIS_MESA_Nk-20.local-h2.2017-07-11.txt",header=T) %>% mutate(h2=local.h2,pop='HIS') %>% dplyr::select(ensid,h2,pop)
h2_res <- rbind(h2_afa,h2_cau,h2_his)

afa <- fread("/home/lauren/files_for_revisions_plosgen/fst_results/fst_table_AFA.txt")
mean_afa <- afa[,list(R2_AFA=mean(R2_AFA, na.rm = TRUE), R2_CAU=mean(R2_CAU, na.rm = TRUE), 
  R2_HIS=mean(R2_HIS, na.rm = TRUE)),
  by=GENE]
r2_afa <- dplyr::select(mean_afa,GENE,R2_AFA) %>% mutate(R2=R2_AFA,pop="AFA") %>% dplyr::select(GENE,R2,pop)
r2_cau <- dplyr::select(mean_afa,GENE,R2_CAU) %>% mutate(R2=R2_CAU,pop="CAU") %>% dplyr::select(GENE,R2,pop)
r2_his <- dplyr::select(mean_afa,GENE,R2_HIS) %>% mutate(R2=R2_HIS,pop="HIS") %>% dplyr::select(GENE,R2,pop)
r2_res <- rbind(r2_afa,r2_cau,r2_his)

mega <- left_join(pve, h2_res, by=c('gene'='ensid','pop'))
## Warning: Column `gene`/`ensid` joining character vector and factor,
## coercing into character vector
mega <- left_join(mega,r2_res,by=c('gene'='GENE','pop'))

#rm NAs for plotting
mega <- mega[complete.cases(mega),]

###need to make h2 and R2 long to facet
b <- ggplot(mega,aes(x=pve50,y=h2,col=R2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y=expression(paste('GCTA ', h^2)),col=expression(R^2),title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()

#make plot like this for elastic net alpha comparison? LASSO vs. EN (colored by alpha=0.5 or 0.05)
c <- ggplot(mega,aes(x=pve50,y=hh50,col=R2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y='BVSR PVE',col=expression(R^2),title='C') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + 
  scale_color_viridis()

sfigA <- ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point(shape=1) + theme_bw(14) + xlab("BLSMM PVE") + ylab("BSLMM PGE") + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + facet_wrap(~pop) + ggtitle('A') +
  scale_color_viridis(discrete=TRUE)

cor(mega[,-1:-2],method='s')
##           pve50      hh50        h2        R2
## pve50 1.0000000 0.9793485 0.9416620 0.8129126
## hh50  0.9793485 1.0000000 0.9362166 0.8213112
## h2    0.9416620 0.9362166 1.0000000 0.7905864
## R2    0.8129126 0.8213112 0.7905864 1.0000000

plot BSLMM and ENET

#read in alpha=1
for(pop in c('AFA','CAU','HIS')){
  a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output_alphas/' %&% pop %&%
               '_nested_cv_all_model_summaries_10_peer_3pcs_a1.txt') %>%
    dplyr::mutate(pop=pop,test_R2_avg_1=ifelse(test_R2_avg<0,NA,test_R2_avg),cv_R2_avg_1=ifelse(cv_R2_avg<0,NA,cv_R2_avg), in_sample_R2_1=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_1,cv_R2_avg_1,in_sample_R2_1)
  if(exists('alpha1')){
    alpha1 <- rbind(alpha1, a)
  }else{
    alpha1 <- a
  }
}



summary(filter(alpha1,pop=='AFA'))
##    gene_id              pop                alpha   test_R2_avg_1  
##  Length:9695        Length:9695        Min.   :1   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1   1st Qu.:0.027  
##  Mode  :character   Mode  :character   Median :1   Median :0.085  
##                                        Mean   :1   Mean   :0.146  
##                                        3rd Qu.:1   3rd Qu.:0.209  
##                                        Max.   :1   Max.   :0.857  
##                                                    NA's   :6248   
##   cv_R2_avg_1    in_sample_R2_1   
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.019   1st Qu.:0.00000  
##  Median :0.062   Median :0.07209  
##  Mean   :0.124   Mean   :0.14379  
##  3rd Qu.:0.172   3rd Qu.:0.22856  
##  Max.   :0.857   Max.   :0.92530  
##  NA's   :4750
summary(filter(alpha1,pop=='CAU'))
##    gene_id              pop                alpha   test_R2_avg_1  
##  Length:9695        Length:9695        Min.   :1   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1   1st Qu.:0.019  
##  Mode  :character   Mode  :character   Median :1   Median :0.066  
##                                        Mean   :1   Mean   :0.135  
##                                        3rd Qu.:1   3rd Qu.:0.184  
##                                        Max.   :1   Max.   :0.869  
##                                                    NA's   :4797   
##   cv_R2_avg_1    in_sample_R2_1    
##  Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.013   1st Qu.:0.002447  
##  Median :0.047   Median :0.049576  
##  Mean   :0.117   Mean   :0.108168  
##  3rd Qu.:0.153   3rd Qu.:0.143128  
##  Max.   :0.870   Max.   :0.881381  
##  NA's   :3664    NA's   :1
summary(filter(alpha1,pop=='HIS'))
##    gene_id              pop                alpha   test_R2_avg_1  
##  Length:9695        Length:9695        Min.   :1   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:1   1st Qu.:0.023  
##  Mode  :character   Mode  :character   Median :1   Median :0.076  
##                                        Mean   :1   Mean   :0.145  
##                                        3rd Qu.:1   3rd Qu.:0.206  
##                                        Max.   :1   Max.   :0.888  
##                                                    NA's   :5236   
##   cv_R2_avg_1    in_sample_R2_1   
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.018   1st Qu.:0.00000  
##  Median :0.057   Median :0.07737  
##  Mean   :0.124   Mean   :0.13724  
##  3rd Qu.:0.168   3rd Qu.:0.20203  
##  Max.   :0.887   Max.   :0.89397  
##  NA's   :3912
#read in alpha=0.05
for(pop in c('AFA','CAU','HIS')){
  a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output_alphas/' %&% pop %&%
               '_nested_cv_all_model_summaries_10_peer_3pcs_a0.05.txt') %>%
    dplyr::mutate(pop=pop,test_R2_avg_05=ifelse(test_R2_avg<0,NA,test_R2_avg),cv_R2_avg_05=ifelse(cv_R2_avg<0,NA,cv_R2_avg), in_sample_R2_05=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_05, cv_R2_avg_05,in_sample_R2_05)
  if(exists('alpha05')){
    alpha05 <- rbind(alpha05, a)
  }else{
    alpha05 <- a
  }
}

summary(filter(alpha05,pop=='AFA'))
##    gene_id              pop                alpha      test_R2_avg_05 
##  Length:9695        Length:9695        Min.   :0.05   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.05   1st Qu.:0.023  
##  Mode  :character   Mode  :character   Median :0.05   Median :0.075  
##                                        Mean   :0.05   Mean   :0.133  
##                                        3rd Qu.:0.05   3rd Qu.:0.188  
##                                        Max.   :0.05   Max.   :0.846  
##                                                       NA's   :6153   
##   cv_R2_avg_05   in_sample_R2_05  
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.015   1st Qu.:0.00000  
##  Median :0.053   Median :0.08561  
##  Mean   :0.112   Mean   :0.15540  
##  3rd Qu.:0.155   3rd Qu.:0.24931  
##  Max.   :0.855   Max.   :0.96483  
##  NA's   :4555
summary(filter(alpha05,pop=='CAU'))
##    gene_id              pop                alpha      test_R2_avg_05 
##  Length:9695        Length:9695        Min.   :0.05   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.05   1st Qu.:0.018  
##  Mode  :character   Mode  :character   Median :0.05   Median :0.064  
##                                        Mean   :0.05   Mean   :0.131  
##                                        3rd Qu.:0.05   3rd Qu.:0.176  
##                                        Max.   :0.05   Max.   :0.868  
##                                                       NA's   :4733   
##   cv_R2_avg_05   in_sample_R2_05   
##  Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.012   1st Qu.:0.003295  
##  Median :0.046   Median :0.052235  
##  Mean   :0.113   Mean   :0.110807  
##  3rd Qu.:0.148   3rd Qu.:0.146903  
##  Max.   :0.870   Max.   :0.882551  
##  NA's   :3616    NA's   :1
summary(filter(alpha05,pop=='HIS'))
##    gene_id              pop                alpha      test_R2_avg_05 
##  Length:9695        Length:9695        Min.   :0.05   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.05   1st Qu.:0.023  
##  Mode  :character   Mode  :character   Median :0.05   Median :0.072  
##                                        Mean   :0.05   Mean   :0.137  
##                                        3rd Qu.:0.05   3rd Qu.:0.192  
##                                        Max.   :0.05   Max.   :0.881  
##                                                       NA's   :5092   
##   cv_R2_avg_05   in_sample_R2_05   
##  Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.016   1st Qu.:0.001975  
##  Median :0.054   Median :0.085076  
##  Mean   :0.118   Mean   :0.144545  
##  3rd Qu.:0.157   3rd Qu.:0.214201  
##  Max.   :0.882   Max.   :0.904676  
##  NA's   :3846
#read in alpha=0.5
for(pop in c('AFA','CAU','HIS')){
  a <- fread('/home/lauren/files_for_revisions_plosgen/en_v7/new_output/' %&% pop %&%
               '_nested_cv_all_model_summaries_10_peer_3pcs.txt') %>%
    dplyr::mutate(pop=pop,test_R2_avg_5=ifelse(test_R2_avg<0,NA,test_R2_avg),cv_R2_avg_5=ifelse(cv_R2_avg<0,NA,cv_R2_avg), in_sample_R2_5=in_sample_R2) %>% dplyr::select(gene_id,pop,alpha,test_R2_avg_5,cv_R2_avg_5,in_sample_R2_5)
  if(exists('alpha5')){
    alpha5 <- rbind(alpha5, a)
  }else{
    alpha5 <- a
  }
}

summary(filter(alpha5,pop=='AFA'))
##    gene_id              pop                alpha     test_R2_avg_5  
##  Length:9695        Length:9695        Min.   :0.5   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.5   1st Qu.:0.025  
##  Mode  :character   Mode  :character   Median :0.5   Median :0.082  
##                                        Mean   :0.5   Mean   :0.145  
##                                        3rd Qu.:0.5   3rd Qu.:0.209  
##                                        Max.   :0.5   Max.   :0.860  
##                                                      NA's   :6209   
##   cv_R2_avg_5    in_sample_R2_5   
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.018   1st Qu.:0.00000  
##  Median :0.061   Median :0.07534  
##  Mean   :0.123   Mean   :0.14635  
##  3rd Qu.:0.170   3rd Qu.:0.23285  
##  Max.   :0.860   Max.   :0.91871  
##  NA's   :4736
summary(filter(alpha5,pop=='CAU'))
##    gene_id              pop                alpha     test_R2_avg_5  
##  Length:9695        Length:9695        Min.   :0.5   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.5   1st Qu.:0.019  
##  Mode  :character   Mode  :character   Median :0.5   Median :0.066  
##                                        Mean   :0.5   Mean   :0.135  
##                                        3rd Qu.:0.5   3rd Qu.:0.185  
##                                        Max.   :0.5   Max.   :0.864  
##                                                      NA's   :4795   
##   cv_R2_avg_5    in_sample_R2_5    
##  Min.   :0.000   Min.   :0.000000  
##  1st Qu.:0.013   1st Qu.:0.002378  
##  Median :0.047   Median :0.050166  
##  Mean   :0.116   Mean   :0.108682  
##  3rd Qu.:0.153   3rd Qu.:0.141928  
##  Max.   :0.873   Max.   :0.883050  
##  NA's   :3648    NA's   :1
summary(filter(alpha5,pop=='HIS'))
##    gene_id              pop                alpha     test_R2_avg_5  
##  Length:9695        Length:9695        Min.   :0.5   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.5   1st Qu.:0.024  
##  Mode  :character   Mode  :character   Median :0.5   Median :0.078  
##                                        Mean   :0.5   Mean   :0.145  
##                                        3rd Qu.:0.5   3rd Qu.:0.206  
##                                        Max.   :0.5   Max.   :0.887  
##                                                      NA's   :5238   
##   cv_R2_avg_5    in_sample_R2_5   
##  Min.   :0.000   Min.   :0.00000  
##  1st Qu.:0.017   1st Qu.:0.00000  
##  Median :0.056   Median :0.07713  
##  Mean   :0.123   Mean   :0.13836  
##  3rd Qu.:0.169   3rd Qu.:0.20596  
##  Max.   :0.887   Max.   :0.90349  
##  NA's   :3900
enet <- left_join(alpha05,alpha5,by=c("gene_id","pop"))
enet <- left_join(enet,alpha1,by=c("gene_id","pop"))

giga <- left_join(enet,mega,by=c("gene_id"="gene","pop"))

#rm(alpha05,alpha5,alpha1)
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_05)) + geom_point() + facet_wrap(~pop) +
  geom_abline(slope=1,intercept=0,col='blue') + theme_bw()
## Warning: Removed 17733 rows containing missing values (geom_point).

table(giga$test_R2_avg_5 >= giga$test_R2_avg_05,useNA="i")
## 
## FALSE  TRUE  <NA> 
##  4919  6433 17733
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_1)) + geom_point() + facet_wrap(~pop) +
  geom_abline(slope=1,intercept=0,col='blue') + theme_bw()  
## Warning: Removed 17814 rows containing missing values (geom_point).

table(giga$test_R2_avg_05 >= giga$test_R2_avg_1,useNA="i")
## 
## FALSE  TRUE  <NA> 
##  6337  4946 17802
ggplot(giga, aes(x=test_R2_avg_5,y=test_R2_avg_1)) + geom_point() + facet_wrap(~pop) +
  geom_abline(slope=1,intercept=0,col='blue')
## Warning: Removed 17814 rows containing missing values (geom_point).

table(giga$test_R2_avg_5 >= giga$test_R2_avg_1,useNA="i")
## 
## FALSE  TRUE  <NA> 
##  5685  5586 17814
ggplot(giga, aes(x=test_R2_avg_1,y=test_R2_avg_1-test_R2_avg_05)) + geom_point() + facet_wrap(~pop) +
  geom_hline(yintercept = 0,col='blue')
## Warning: Removed 17802 rows containing missing values (geom_point).

ggplot(giga, aes(x=test_R2_avg_1,y=test_R2_avg_1-test_R2_avg_5)) + geom_point() + facet_wrap(~pop) +
  geom_hline(yintercept = 0,col='blue')
## Warning: Removed 17814 rows containing missing values (geom_point).

#bland-altman
ba1 <- left_join(alpha1,alpha5,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_1-test_R2_avg_5) %>%
  select(gene_id,pop,alpha.y,test_R2_avg_1,diffR2)
ba2 <- left_join(alpha1,alpha05,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_1-test_R2_avg_05) %>%
  select(gene_id,pop,alpha.y,test_R2_avg_1,diffR2)
ba <- rbind(ba1,ba2) %>% mutate(alpha = factor(alpha.y,levels=c('0.05','0.5')),gt0=diffR2>0) 
fig4a <- ggplot(ba, aes(x=test_R2_avg_1,y=diffR2,col=alpha)) + geom_point(shape=1) + 
  scale_color_viridis(discrete=TRUE) + labs(x=expression(paste('lasso ',R^2)),y=expression(paste(R^2, ' difference (lasso - ',alpha,')')),col=expression(alpha),title='A') +  geom_hline(yintercept = 0,col='darkgray') +  coord_cartesian(ylim=c(-0.6,0.6)) +
  facet_wrap(~pop) + theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.9,0.01),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.4, "cm"))

ggplot(ba, aes(x=test_R2_avg_1,y=diffR2,col=alpha)) + geom_density_2d() +
  scale_color_viridis(discrete=TRUE) + labs(x=expression(paste('lasso ',R^2)),y=expression(paste(R^2, ' difference (lasso - ',alpha,')')),col=expression(alpha)) + coord_cartesian(ylim=c(-0.5,0.5)) +
  geom_hline(yintercept = 0,col='darkgray') + 
  facet_wrap(~pop) + theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.9,0.01),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.4, "cm"))
## Warning: Removed 35616 rows containing non-finite values (stat_density2d).

fig4b <- ggplot(ba, aes(x=test_R2_avg_1,y=diffR2,col=alpha)) + geom_density_2d() +
  scale_color_viridis(discrete=TRUE) + labs(x=expression(paste('lasso ',R^2, ' ZOOM')),y=expression(paste(R^2, ' difference (lasso - ',alpha,')')),col=expression(alpha),title='B') + 
  geom_hline(yintercept = 0,col='darkgray') + coord_cartesian(ylim=c(-0.08,0.08)) +
  facet_wrap(~pop) + theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.9,0.01),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.4, "cm"))

#make table of diffR2 > 0 counts/percents
for(p in c('AFA','CAU','HIS')){
  ba_sub <- dplyr::filter(ba,pop==p,alpha==0.05)
  poscount05 <- table(ba_sub$diffR2>0)[[2]]
  negcount05 <- table(ba_sub$diffR2<0)[[2]]
  total05 <- sum(table(ba_sub$diffR2>0))
  prop05 <- signif(poscount05/total05,3)
  ba_sub <- dplyr::filter(ba,pop==p,alpha==0.5)
  poscount5 <- table(ba_sub$diffR2>0)[[2]]
  negcount5 <- table(ba_sub$diffR2<0)[[2]]
  total5 <- sum(table(ba_sub$diffR2>0))
  prop5 <- signif(poscount5/total5,3)
  restab <- c(p,poscount05,total05,prop05,poscount5,total5,prop5)
  if(exists('diffR2_tab')){
    diffR2_tab <- rbind(diffR2_tab, restab)
  }else{
    diffR2_tab <- restab
  }
}
colnames(diffR2_tab) <- c("pop","poscount_05","total_05","prop_05","poscount_5","total_5","prop_5")
print(diffR2_tab)
##            pop   poscount_05 total_05 prop_05 poscount_5 total_5 prop_5 
## diffR2_tab "AFA" "1655"      "2828"   "0.585" "1485"     "2862"  "0.519"
## restab     "CAU" "2525"      "4481"   "0.563" "2226"     "4465"  "0.499"
## restab     "HIS" "2157"      "3974"   "0.543" "1974"     "3944"  "0.501"
xtable(diffR2_tab)
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 3 --> row.names NOT used
## % latex table generated in R 3.4.4 by xtable 1.8-2 package
## % Sat May 19 11:32:05 2018
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlllllll}
##   \hline
##  & pop & poscount\_05 & total\_05 & prop\_05 & poscount\_5 & total\_5 & prop\_5 \\ 
##   \hline
## 1 & AFA & 1655 & 2828 & 0.585 & 1485 & 2862 & 0.519 \\ 
##   2 & CAU & 2525 & 4481 & 0.563 & 2226 & 4465 & 0.499 \\ 
##   3 & HIS & 2157 & 3974 & 0.543 & 1974 & 3944 & 0.501 \\ 
##    \hline
## \end{tabular}
## \end{table}

BSLMM plot with test_avg_R2

#c
ggplot(giga,aes(x=pve50,y=hh50,col=test_R2_avg_5)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y='BVSR PVE',col=expression(R^2),title='C') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + 
  scale_color_viridis()
## Warning: Removed 2805 rows containing missing values (geom_point).

#a
ggplot(data,aes(x=pve50,y=pge50,ymin=pge025,ymax=pge975,col=LCS)) + geom_pointrange(col='gray') + geom_point() + theme_bw(14) + xlab("BLSMM PVE") + ylab("BSLMM PGE") + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + facet_wrap(~pop) + ggtitle('A') +
  scale_color_viridis(discrete=TRUE)

#b
ggplot(giga,aes(x=pve50,y=h2,col=test_R2_avg_5)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x='BSLMM PVE',y=expression(paste('GCTA ', h^2)),col=expression(R^2),title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 2805 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_05,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 17733 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_05,col=h2)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='GCTA\nh2',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 17733 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_5,y=test_R2_avg_1,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',0.5,')')),y=expression(paste(R^2, " (",alpha, ' = ',1,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 17814 rows containing missing values (geom_point).

ggplot(giga,aes(x=test_R2_avg_1,y=test_R2_avg_05,col=pve50)) + geom_point() + facet_wrap(~pop) + geom_abline(slope=1,intercept = 0,color='blue') + theme_bw(14) + labs(x=expression(paste(R^2, " (",alpha, ' = ',1,')')),y=expression(paste(R^2, " (",alpha, ' = ',0.05,')')),col='BSLMM\nPVE',title='B') + theme(legend.position = c(0.995,0.005),legend.justification = c(1,0),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),plot.margin=unit(c(0.1,0.2,0.1,0.2), "cm")) + scale_color_viridis()
## Warning: Removed 17802 rows containing missing values (geom_point).

keygiga <- dplyr::select(giga,test_R2_avg_05,test_R2_avg_5,test_R2_avg_1,pve50,hh50,h2)
cor(keygiga, use='pairwise',method='s')
##                test_R2_avg_05 test_R2_avg_5 test_R2_avg_1     pve50
## test_R2_avg_05      1.0000000     0.9567743     0.9529598 0.8959922
## test_R2_avg_5       0.9567743     1.0000000     0.9616368 0.8984249
## test_R2_avg_1       0.9529598     0.9616368     1.0000000 0.8998705
## pve50               0.8959922     0.8984249     0.8998705 1.0000000
## hh50                0.9033259     0.9053237     0.9065704 0.9793485
## h2                  0.8674913     0.8594325     0.8593600 0.9416620
##                     hh50        h2
## test_R2_avg_05 0.9033259 0.8674913
## test_R2_avg_5  0.9053237 0.8594325
## test_R2_avg_1  0.9065704 0.8593600
## pve50          0.9793485 0.9416620
## hh50           1.0000000 0.9362166
## h2             0.9362166 1.0000000
abvsr <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='BVSR',PVE=hh50)
abslmm <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='BSLMM',PVE=pve50)
agcta <- left_join(alpha5,mega,by=c('gene_id'='gene','pop')) %>% mutate(model='LMM',PVE=h2)
aplot <- rbind(abvsr, abslmm, agcta)
ggplot(aplot, aes(x=test_R2_avg_5,y=PVE,col=model)) + geom_smooth() +
  facet_wrap(~pop) + theme_bw() + scale_color_viridis(discrete=TRUE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 48798 rows containing non-finite values (stat_smooth).

bvsrba1 <- mutate(mega,diffPVE=hh50-pve50,model='BSLMM') %>% dplyr::select(pop,gene,hh50,diffPVE,model)
bvsrba2 <- mutate(mega,diffPVE=hh50-h2,model='LMM') %>% dplyr::select(pop,gene,hh50,diffPVE,model)
bvsrba <- rbind(bvsrba1,bvsrba2) %>% mutate(model=factor(model,levels=c('LMM','BSLMM')))
sfigB <- ggplot(bvsrba,aes(x=hh50,y=diffPVE,col=model)) + geom_point(shape=1) + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BVSR PVE',y='PVE difference (BVSR - model)',title='B') +
  geom_hline(yintercept = 0,col='darkgray') + 
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.7),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))


ggplot(bvsrba,aes(x=hh50,y=diffPVE,col=model)) + geom_density_2d() + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BVSR PVE',y='PVE difference (BVSR - model)') +
  geom_hline(yintercept = 0,col='darkgray') + coord_cartesian(ylim=c(-.4,0.6)) +
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.65),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))

sfigC <- ggplot(bvsrba,aes(x=hh50,y=diffPVE,col=model)) + geom_density_2d() + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BVSR PVE',y='PVE difference (BVSR - model)',title='C') +
  geom_hline(yintercept = 0,col='darkgray') + 
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.7),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))
print(sfigC)

#make table of diffPVE > 0 counts/percents
for(p in c('AFA','CAU','HIS')){
  ba_sub <- dplyr::filter(bvsrba,pop==p,model=='LMM')
  poscountLMM <- table(ba_sub$diffPVE>0)[[2]]
  totalLMM <- sum(table(ba_sub$diffPVE>0))
  propLMM <- signif(poscountLMM/totalLMM,3)
  ba_sub <- dplyr::filter(bvsrba,pop==p,model=='BSLMM')
  poscountBSLMM <- table(ba_sub$diffPVE>0)[[2]]
  totalBSLMM <- sum(table(ba_sub$diffPVE>0))
  propBSLMM <- signif(poscountBSLMM/totalBSLMM,3)
  restab <- c(p,poscountLMM,totalLMM,propLMM,poscountBSLMM,totalBSLMM,propBSLMM)
  if(exists('diffPVE_tab_bvsr')){
    diffPVE_tab_bvsr <- rbind(diffPVE_tab_bvsr, restab)
  }else{
    diffPVE_tab_bvsr <- restab
  }
}
colnames(diffPVE_tab_bvsr) <- c("pop","poscount_LMM","total_LMM","prop_LMM","poscount_BSLMM","total_BSLMM","prop_BSLMM")
print(diffPVE_tab_bvsr)
##                  pop   poscount_LMM total_LMM prop_LMM poscount_BSLMM
## diffPVE_tab_bvsr "AFA" "5175"       "9172"    "0.564"  "1233"        
## restab           "CAU" "4329"       "8567"    "0.505"  "1172"        
## restab           "HIS" "4345"       "8541"    "0.509"  "1121"        
##                  total_BSLMM prop_BSLMM
## diffPVE_tab_bvsr "9172"      "0.134"   
## restab           "8567"      "0.137"   
## restab           "8541"      "0.131"
xtable(diffPVE_tab_bvsr)
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 3 --> row.names NOT used
## % latex table generated in R 3.4.4 by xtable 1.8-2 package
## % Sat May 19 11:32:26 2018
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlllllll}
##   \hline
##  & pop & poscount\_LMM & total\_LMM & prop\_LMM & poscount\_BSLMM & total\_BSLMM & prop\_BSLMM \\ 
##   \hline
## 1 & AFA & 5175 & 9172 & 0.564 & 1233 & 9172 & 0.134 \\ 
##   2 & CAU & 4329 & 8567 & 0.505 & 1172 & 8567 & 0.137 \\ 
##   3 & HIS & 4345 & 8541 & 0.509 & 1121 & 8541 & 0.131 \\ 
##    \hline
## \end{tabular}
## \end{table}

do BSLMM on x-axis

bslmmba1 <- mutate(mega,diffPVE=pve50-hh50,model='BVSR') %>% dplyr::select(pop,gene,pve50,diffPVE,model)
bslmmba2 <- mutate(mega,diffPVE=pve50-h2,model='LMM') %>% dplyr::select(pop,gene,pve50,diffPVE,model)
bslmmba <- rbind(bslmmba1,bslmmba2) %>% mutate(model=factor(model,levels=c('LMM','BVSR')))
sfigB_bslmm <- ggplot(bslmmba,aes(x=pve50,y=diffPVE,col=model)) + geom_point(shape=1) + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BSLMM PVE',y='PVE difference (BSLMM - model)',title='B') +
  geom_hline(yintercept = 0,col='darkgray') +  coord_cartesian(ylim=c(-0.65,0.65)) +
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.05),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))
print(sfigB_bslmm)

ggplot(bslmmba,aes(x=pve50,y=diffPVE,col=model)) + geom_density_2d() + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BSLMM PVE',y='PVE difference (BSLMM - model)') +
  geom_hline(yintercept = 0,col='darkgray') + coord_cartesian(ylim=c(-.4,0.6)) +
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.65),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))

sfigC_bslmm <- ggplot(bslmmba,aes(x=pve50,y=diffPVE,col=model)) + geom_density_2d() + facet_wrap(~pop) +
scale_color_viridis(discrete=TRUE) + labs(x='BSLMM PVE',y='PVE difference (BSLMM - model)',title='C') +
  geom_hline(yintercept = 0,col='darkgray') + coord_cartesian(ylim=c(-0.06,0.06)) +
  theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.88,0.05),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.3, "cm"),legend.background = element_rect(fill="white",size=0.5, linetype="solid", colour ="black"))
print(sfigC_bslmm)

#make table of diffPVE > 0 counts/percents
for(p in c('AFA','CAU','HIS')){
  ba_sub <- dplyr::filter(bslmmba,pop==p,model=='LMM')
  poscountLMM <- table(ba_sub$diffPVE>0)[[2]]
  totalLMM <- sum(table(ba_sub$diffPVE>0))
  propLMM <- signif(poscountLMM/totalLMM,3)
  ba_sub <- dplyr::filter(bslmmba,pop==p,model=='BVSR')
  poscountBSLMM <- table(ba_sub$diffPVE>0)[[2]]
  totalBSLMM <- sum(table(ba_sub$diffPVE>0))
  propBSLMM <- signif(poscountBSLMM/totalBSLMM,3)
  restab <- c(p,poscountLMM,totalLMM,propLMM,poscountBSLMM,totalBSLMM,propBSLMM)
  if(exists('diffPVE_tab_bslmm')){
    diffPVE_tab_bslmm <- rbind(diffPVE_tab_bslmm, restab)
  }else{
    diffPVE_tab_bslmm <- restab
  }
}
colnames(diffPVE_tab_bslmm) <- c("pop","poscount_LMM","total_LMM","prop_LMM","poscount_BVSR","total_BVSR","prop_BVSR")
print(diffPVE_tab_bslmm)
##                   pop   poscount_LMM total_LMM prop_LMM poscount_BVSR
## diffPVE_tab_bslmm "AFA" "6620"       "9172"    "0.722"  "7939"       
## restab            "CAU" "6499"       "8567"    "0.759"  "7395"       
## restab            "HIS" "6081"       "8541"    "0.712"  "7420"       
##                   total_BVSR prop_BVSR
## diffPVE_tab_bslmm "9172"     "0.866"  
## restab            "8567"     "0.863"  
## restab            "8541"     "0.869"
xtable(diffPVE_tab_bslmm)
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 3 --> row.names NOT used
## % latex table generated in R 3.4.4 by xtable 1.8-2 package
## % Sat May 19 11:32:32 2018
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlllllll}
##   \hline
##  & pop & poscount\_LMM & total\_LMM & prop\_LMM & poscount\_BVSR & total\_BVSR & prop\_BVSR \\ 
##   \hline
## 1 & AFA & 6620 & 9172 & 0.722 & 7939 & 9172 & 0.866 \\ 
##   2 & CAU & 6499 & 8567 & 0.759 & 7395 & 8567 & 0.863 \\ 
##   3 & HIS & 6081 & 8541 & 0.712 & 7420 & 8541 & 0.869 \\ 
##    \hline
## \end{tabular}
## \end{table}
#put alpha=0.5 on x axis
ba1 <- left_join(alpha5,alpha1,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_5-test_R2_avg_1) %>%
  select(gene_id,pop,alpha.y,test_R2_avg_5,diffR2)
ba2 <- left_join(alpha5,alpha05,by=c('gene_id','pop')) %>% mutate(diffR2=test_R2_avg_5-test_R2_avg_05) %>%
  select(gene_id,pop,alpha.y,test_R2_avg_5,diffR2)
ba <- rbind(ba1,ba2) %>% mutate(alpha = factor(alpha.y,levels=c('1','0.05')),gt0=diffR2>0) 
ggplot(ba, aes(x=test_R2_avg_5,y=diffR2,col=alpha)) + geom_point(shape=1) + 
  scale_color_viridis(discrete=TRUE) + labs(x=expression(paste(alpha,' = 0.5 ',R^2)),y=expression(paste(R^2, ' difference ( ',alpha, ' = 0.5 - ',alpha,')')),col=expression(alpha),title='A') +  geom_hline(yintercept = 0,col='darkgray') + 
  facet_wrap(~pop) + theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.9,0.01),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.4, "cm"))
## Warning: Removed 35547 rows containing missing values (geom_point).

ggplot(ba, aes(x=test_R2_avg_5,y=diffR2,col=alpha)) + geom_density_2d() +
  scale_color_viridis(discrete=TRUE) + labs(x=expression(paste(alpha, ' = 0.5 ',R^2, ' ZOOM')),y=expression(paste(R^2, ' difference ( ',alpha,' = 0.5 -  ',alpha,')')),col=expression(alpha),title='B') + 
  geom_hline(yintercept = 0,col='darkgray') + 
  facet_wrap(~pop) + theme_bw(14) + theme(legend.justification=c(0,0), legend.position=c(0.9,0.01),legend.title = element_text(size=10),legend.text = element_text(size=10),legend.key.size = unit(0.4, "cm"))
## Warning: Removed 35547 rows containing non-finite values (stat_density2d).

Fig 4

grid.arrange(fig4a, fig4b, nrow=2)
## Warning: Removed 35616 rows containing missing values (geom_point).
## Warning: Removed 35616 rows containing non-finite values (stat_density2d).

BVSR SFig

grid.arrange(sfigA, sfigB, sfigC, nrow=3)

BSLMM SFig

grid.arrange(sfigA, sfigB_bslmm, sfigC_bslmm, nrow=3)